likert_conv <- function(x) {
  xl = str_to_lower(x)
  xn = case_when(xl == "strongly disagree" ~ 5,
                 xl == "disagree" ~ 4,
                 xl == "neither agree nor disagree" ~ 3,
                 xl == "agree" ~ 2,
                 xl == "strongly agree" ~ 1)
  return(xn)
}

az_recode <- function(data) {
  new_data <- data %>% 
    mutate(
      pgg_same = as.integer(pgg == country),
      b_av = (pgg_cond_contr_0 +pgg_cond_contr_1+ pgg_cond_contr_2+ pgg_cond_contr_3+ pgg_cond_contr_4 +pgg_cond_contr_5)/6,
      b_var = ((pgg_cond_contr_0-b_av)^2+(pgg_cond_contr_1-b_av)^2+(pgg_cond_contr_2-b_av)^2+(pgg_cond_contr_3-b_av)^2+(pgg_cond_contr_4-b_av)^2+(pgg_cond_contr_5-b_av)^2)/6,
      b_sd = sqrt(b_var),
      b_corr = ifelse(b_var == 0,
                      0,
                      (2.5*( pgg_cond_contr_5 - pgg_cond_contr_0) + 
                         1.5*( pgg_cond_contr_4 - pgg_cond_contr_1) + 
                         0.5*( pgg_cond_contr_3 - pgg_cond_contr_2))/(6* b_sd*1.7082512766)),
      b_corr = round(b_corr, 3),
      peak2 = pgg_cond_contr_0<= pgg_cond_contr_1 & 
        pgg_cond_contr_1>= pgg_cond_contr_2 & 
        pgg_cond_contr_2>= pgg_cond_contr_3 & 
        pgg_cond_contr_3>= pgg_cond_contr_4 & 
        pgg_cond_contr_4>= pgg_cond_contr_5,
      peak3 = pgg_cond_contr_0 <= pgg_cond_contr_1 &
        pgg_cond_contr_1<= pgg_cond_contr_2 &
        pgg_cond_contr_2>= pgg_cond_contr_3 &
        pgg_cond_contr_3>= pgg_cond_contr_4 &
        pgg_cond_contr_4>= pgg_cond_contr_5,
      peak4 = pgg_cond_contr_0<= pgg_cond_contr_1 & 
        pgg_cond_contr_1<= pgg_cond_contr_2 &
        pgg_cond_contr_2<= pgg_cond_contr_3 &
        pgg_cond_contr_3>= pgg_cond_contr_4 &
        pgg_cond_contr_4>= pgg_cond_contr_5,
      peak5 = pgg_cond_contr_0 <= pgg_cond_contr_1 &
        pgg_cond_contr_1<= pgg_cond_contr_2 &
        pgg_cond_contr_2<= pgg_cond_contr_3 &
        pgg_cond_contr_3<= pgg_cond_contr_4 & 
        pgg_cond_contr_4>= pgg_cond_contr_5,
      across(all_of(c("peak2","peak3","peak4","peak5")), as.integer),
      
      b_av3L= (pgg_cond_contr_0+ pgg_cond_contr_1+ pgg_cond_contr_2)/3,
      b_av4L=( pgg_cond_contr_0+ pgg_cond_contr_1+ pgg_cond_contr_2+ pgg_cond_contr_3 )/4,
      b_av3R=( pgg_cond_contr_2+ pgg_cond_contr_3 + pgg_cond_contr_4+ pgg_cond_contr_5 )/4,
      b_av4R=(pgg_cond_contr_3 + pgg_cond_contr_4+ pgg_cond_contr_5 )/3,
      
      b_sd3L=sqrt((( pgg_cond_contr_0- b_av3L)^2+( pgg_cond_contr_1- b_av3L)^2+ ( pgg_cond_contr_2- b_av3L)^2)/3),
      b_sd4L=sqrt((( pgg_cond_contr_0- b_av4L)^2+( pgg_cond_contr_1- b_av4L)^2+ ( pgg_cond_contr_2- b_av4L)^2+ ( pgg_cond_contr_3- b_av4L)^2)/4),
      b_sd3R=sqrt((( pgg_cond_contr_2- b_av3R)^2+( pgg_cond_contr_3- b_av3R)^2+ ( pgg_cond_contr_4- b_av3R)^2+ ( pgg_cond_contr_5- b_av3R)^2)/4),
      b_sd4R=sqrt((( pgg_cond_contr_3- b_av4R)^2+ ( pgg_cond_contr_4- b_av4R)^2+ ( pgg_cond_contr_5- b_av4R)^2)/3),
      
      b_corr3L=(pgg_cond_contr_2-pgg_cond_contr_0)/(3* b_sd3L*sqrt(2/3)),
      b_corr4L=(1.5*(pgg_cond_contr_3-pgg_cond_contr_0)+0.5*(pgg_cond_contr_2-pgg_cond_contr_1))/(4* b_sd4L*sqrt(1.25)),
      b_corr3R=(1.5*(pgg_cond_contr_5-pgg_cond_contr_2)+0.5*(pgg_cond_contr_4-pgg_cond_contr_3))/(4* b_sd3R*sqrt(1.25)),
      b_corr4R=(pgg_cond_contr_5-pgg_cond_contr_3)/(3* b_sd4R*sqrt(2/3)),
      
      b_corr3L = ifelse(b_sd3L==0,0,b_corr3L),
      b_corr4L = ifelse(b_sd4L==0,0,b_corr4L),
      b_corr3R = ifelse(b_sd3R==0,0,b_corr3R),
      b_corr4R = ifelse(b_sd4R==0,0,b_corr4R),
      
      coop_type1 = case_when(pgg_cond_contr_0 + 
                               pgg_cond_contr_1 + 
                               pgg_cond_contr_2 + 
                               pgg_cond_contr_3 + 
                               pgg_cond_contr_4 + 
                               pgg_cond_contr_5 == 0 ~ 1,
                             
                             pgg_cond_contr_0 == pgg_cond_contr_1 &
                               pgg_cond_contr_1 == pgg_cond_contr_2 &
                               pgg_cond_contr_2 == pgg_cond_contr_3 &
                               pgg_cond_contr_3 == pgg_cond_contr_4 &
                               pgg_cond_contr_4 == pgg_cond_contr_5 ~ 2,
                             
                             (pgg_cond_contr_0<=pgg_cond_contr_1 &
                                pgg_cond_contr_1<=pgg_cond_contr_2 &
                                pgg_cond_contr_2<=pgg_cond_contr_3 &
                                pgg_cond_contr_3<=pgg_cond_contr_4 &
                                pgg_cond_contr_4<=pgg_cond_contr_5) | b_corr>=.5 ~ 3,
                             
                             peak2 == 1 | peak3 == 1 | peak4 == 1 | peak5 == 1 ~ 4,
                             b_corr3L >= 0.5 & b_corr3R <=-0.5 ~ 4,
                             b_corr4L >= 0.5 & b_corr4R <=-0.5 ~ 4,
                             TRUE ~ 99),
      coop_type1_txt = case_when(coop_type1 == 1 ~ "Free rider",
                                 coop_type1 == 2 ~ "Unconditional cooperator",
                                 coop_type1 == 3 ~ "Conditional cooperator",
                                 coop_type1 == 4 ~ "Triangular",
                                 coop_type1 == 99 ~ "Other"),
      
      # female = as.integer(gender == "Female"),
      age_cat = case_when(age<30 ~ "18-29",
                          age>=30&age<40 ~ "30-39",
                          age>=40&age<50 ~ "40-49",
                          age>=50&age<60 ~ "50-59",
                          age>=60&age<70 ~ "60-69",
                          age>=70 ~ "70+"),
      
      inc_change = case_when(!is.na(eco_out_11) ~ -eco_out_11,
                             eco_out_9 == "Stayed the same" ~ 0,
                             TRUE ~ eco_out_10),
      
      eq5d_6 = ifelse(eq5d_6>100, NA, eq5d_6),
      eq5d_7 =  ifelse(eq5d_7>100, NA, eq5d_7),
      eq5_change = eq5d_6- eq5d_7,
      
      redist_gov=(5- likert_conv(eco_att_3_1))/4,
      redist_incdiff=(5- likert_conv(eco_att_3_2))/4,
      redist_fraud=(5- likert_conv(eco_att_3_3))/4,
      redist_unemp=(5- likert_conv(eco_att_3_4))/4,
      redist_housing=(5- likert_conv(eco_att_3_5))/4,
      redist_prog=(5- likert_conv(eco_att_3_6))/4,
      eco_att_2_n = case_when(eco_att_2 == "Hard work is most important" ~ 1,
                              eco_att_2 == "Hard work and luck are equally important" ~ 2,
                              eco_att_2 == "Luck is most important" ~ 3),
      
      workluck= ( eco_att_2_n-1)/2,
      risk = eco_att_1,
      eco_att_7_n = case_when(eco_att_7 == "Today" ~ 1,
                              eco_att_7 == "In 12 months" ~ 2),
      eco_att_8_n = case_when(eco_att_8 == "Today" ~ 1,
                              eco_att_8 == "In 12 months" ~ 2),
      eco_att_9_n = case_when(eco_att_9 == "Today" ~ 1,
                              eco_att_9 == "In 12 months" ~ 2),
      eco_att_10_n = case_when(eco_att_10 == "Today" ~ 1,
                               eco_att_10 == "In 12 months" ~ 2),
      patience = case_when(!is.na(eco_att_8_n) ~ eco_att_8_n+2,
                           !is.na(eco_att_9_n) ~ eco_att_9_n+4,
                           !is.na(eco_att_10_n) ~ eco_att_10_n+6,
                           TRUE ~ eco_att_7_n),
      
      INCOME_own=ifelse(is.na(INCOME_HIGH), INCOME_LOW, (INCOME_LOW+INCOME_HIGH)/2),
      l_inc=log(INCOME_own),
      inc_na = as.integer(is.na(l_inc)),
      # l_inc = ifelse(is.na(l_inc),0,l_inc), ##*##
      
      INCOME_hh = ifelse(is.na(INCOME_HH_HIGH), INCOME_HH_LOW, (INCOME_HH_LOW+INCOME_HH_HIGH)/2),
      INCOME_hh = ifelse(HH_size > 20, NA, INCOME_hh/sqrt(HH_size+1)),
      l_inc_hh = log(INCOME_hh),
      # l_inc_hh = ifelse(is.na(log(INCOME_hh)),0,log(INCOME_hh)), ##*##
      inc_hh_na = as.integer(is.na(log(INCOME_hh))),
      
      educ = case_when(EDUCATION_LEVEL %in% c("Primary completed","Less than primary completed") ~ "Low",
                       EDUCATION_LEVEL == "Secondary completed" ~ "Medium",
                       EDUCATION_LEVEL == "University completed" ~ "High"),
      
      across(
        .cols = starts_with("vac_hes_4_"),
        .fns = function (x) as.integer(x == "Yes"),
        .names = "covidexp_{.col}"
      )) %>% 
    rename_with(function (x) gsub("vac_hes_4_","",x), starts_with("covidexp_vac_hes_4_")) %>% 
    mutate(covidexp_index = (covidexp_1 + 
                               covidexp_2 + 
                               covidexp_3 + 
                               covidexp_4 + 
                               covidexp_5)/5) %>% 
    group_by(country) %>% 
    mutate(covidexp_index_med = median(covidexp_index),
           covidexp_index_abovemed = as.integer(covidexp_index> covidexp_index_med)) %>% 
    ungroup() %>% 
    filter(!duplicated(id)) %>% 
    rename(deaths_db = deaths) %>% 
    left_join(as_factor(read_dta("additional_data/distance_pgg.dta")), by = c("country","pgg")) %>% 
    left_join(as_factor(read_dta("additional_data/cow_pgg.dta")), by = c("country","pgg")) %>%
    left_join(as_factor(read_dta("additional_data/gdpown_pgg.dta")), by = "country") %>%
    left_join(as_factor(read_dta("additional_data/gdpother_pgg.dta")), by = "pgg") %>%
    left_join(as_factor(read_dta("additional_data/geneticdist_pgg.dta")), by = c("country","pgg")) %>%
    left_join(as_factor(read_dta("additional_data/valown_pgg.dta")), by = "country") %>%
    left_join(as_factor(read_dta("additional_data/valother_pgg.dta")), by = "pgg") %>%
    left_join(as_factor(read_dta("additional_data/trade_pgg.dta")), by = c("country","pgg")) %>% 
    
    mutate(pgg_cow_war = as.integer(!is.na(cow_fatlev)),
           pgg_dist_trasec = abs( pgg_own_trasec- pgg_other_trasec),
           pgg_dist_surexp =abs( pgg_own_surexp- pgg_other_surexp),
           pgg_cult_dist = sqrt(( pgg_own_trasec- pgg_other_trasec)^2+( pgg_own_surexp- pgg_other_surexp)^2),
           
           pgg_geneticdist = ifelse(is.na(pgg_geneticdist), 0, pgg_geneticdist), ##*##
           pgg_geneticdist = ifelse(is.na(pgg), NA, pgg_geneticdist),
           pgg_geneticdist = pgg_geneticdist/1000,
           
           trade_share = ifelse(is.na(trade_share), 0, trade_share)) %>% ##*##
    rename(pgg_trade_share = trade_share,
           pgg_trade_share_l = trade_share_l) %>% 
    
    mutate(relmort2021 =(deaths2021- deaths2019)/ deaths2019,
           relmort2020 = ( deaths2020- deaths2019)/ deaths2019,
           absmort2021 = ( deaths2021- deaths2019)/ pop2019,
           absmort2020 = ( deaths2020- deaths2019)/ pop2019,
           income_abovemed = as.integer(
             ((INCOME_LOW+INCOME_HIGH)/2 > INCOME_MED) | (!is.na(INCOME_LOW) & is.na(INCOME_HIGH))
           ),
           income_belowmed = as.integer(((INCOME_LOW+INCOME_HIGH)/2) < INCOME_MED),
           income_belowmed = ifelse(is.na(INCOME_HIGH), 0, income_belowmed), ##*## Needed as incl. cases of top bracket earners
           income_abovemed = ifelse(is.na(INCOME_LOW),0, income_abovemed), ##*##
           ## TSR/YC change
           income_belowmed = ifelse(is.na(INCOME_LOW) & is.na(INCOME_HIGH),NA, income_belowmed),
           income_abovemed = ifelse(is.na(INCOME_LOW) & is.na(INCOME_HIGH),NA, income_abovemed),
           
           pgg_income_sim= -abs( log(pgg_own_gdp)- log(pgg_other_gdp)),
           pgg_own_highinc = as.integer(country %in% c("Australia",
                                                       "Canada",
                                                       "Chile",
                                                       "France",
                                                       "Italy",
                                                       "Japan",
                                                       "Spain",
                                                       "UK",
                                                       "US")),
           pgg_other_lowinc = case_when(pgg == "Uganda" ~ 1,
                                        TRUE ~ 0),
           pgg_highlow = pgg_own_highinc*pgg_other_lowinc,
           
           pgg_distance_km_l=log( pgg_distance_km+1),
           
           pgg_geneticdist = ifelse(is.na(pgg_geneticdist), 0, pgg_geneticdist), ##*##
           covid_geneticdist = covidexp_index * pgg_geneticdist,
           covid_cultdist = covidexp_index * pgg_cult_dist,
           covid_highlow = covidexp_index * pgg_highlow,
           covid_trade = covidexp_index * pgg_trade_share,
           covid_dist = covidexp_index * pgg_distance_km_l,
           covid_cow = covidexp_index * pgg_cow_war,
           covid_similar = covidexp_index * pgg_income_sim,
           covid_same = covidexp_index * pgg_same
    )
  
  return(new_data)
}

compare_vars <- function(var, stata_data, r_data, tol) {
  
  if (is.null(stata_data[[var]])) {
    return("Missing in AZ")
  }
  
  test_data <- stata_data[,c("id","country_str",var)] %>% 
    mutate(c_id = paste0(country_str,"_",id)) %>% 
    left_join({r_data[,c("id","country",var)] %>% mutate(c_id = paste0(country,"_",id))},
              by = "c_id")
  
  test_data$x <- test_data[[paste0(var,".x")]]
  test_data$y <- test_data[[paste0(var,".y")]]
  
  if (!is.numeric(test_data$y)) {
    test_data$x <- as_factor(test_data$x)
    test_data$y <- as.numeric(factor(test_data$y, levels = levels(test_data$x)))
    test_data$x <- as.numeric(test_data$x)
  }
  
  test_data$check <- abs(test_data$x - test_data$y) <= tol

  return(mean(test_data$check, na.rm = TRUE))
}

check_data <- function(raw_data, stata_data, r_data, tol = 1e-1) {
  new_vars <- colnames(new_data)[!(colnames(new_data) %in% colnames(raw_data))]
  return(sapply(new_vars, function(x) compare_vars(x, stata_data, r_data, tol)))
}

  

## < 1% difference for coop_type1 but according to Stata code, should be same as R
## Error in Stata code for l_inc_hh_na
## Not merging conjoint data at present
